# Load packages
library(DESeq2) # ‘1.46.0’
library(dplyr)
library(ggplot2)
library(org.Hs.eg.db) # library(org.Mm.eg.db) for mouse
library(ComplexHeatmap)
library(circlize) # for color scales
library(grid) # for gpar()
library(EnhancedVolcano)
# Load count_matrix
args(read.delim) # header = TRUE by default
function (file, header = TRUE, sep = "\t", quote = "\"", dec = ".",
fill = TRUE, comment.char = "", ...)
NULL
rawCounts <- read.delim("http://genomedata.org/gen-viz-workshop/intro_to_deseq2/tutorial/E-GEOD-50760-raw-counts.tsv")
# Format rawCounts as required by DESeq2
# 1. Set gene ID column as rowname
rownames(rawCounts) <- rawCounts$Gene.ID
# DESeq2 expects the rawCounts to contain only raw, unnormalized integer values, with no non-numeric columns.
# 2. Remove columns with non numerical values
rawCounts_bkup <- rawCounts
rawCounts <- rawCounts[, -c(1, 2)]
class(rawCounts) # a data.frame
[1] "data.frame"
# 3. Convert rawCounts as a matrix
rawCounts <- as.matrix(rawCounts)
# Note: we can do both steps above in one line of code
# rawCounts <- as.matrix(rawCounts[, sapply(rawCounts, is.numeric)])
# checks
is.matrix(rawCounts) # should be TRUE
is.numeric(rawCounts) # should be TRUE
# # Filtering: remove genes with 0 reads in all samples.
dim(rawCounts)
[1] 65217 54
dim(rawCounts[which(rowSums(rawCounts)>0),])
[1] 44316 54
rawCounts <- rawCounts[which(rowSums(rawCounts)>0),]
# Load colData (a dataframe describing samples)
colData <- read.delim("http://genomedata.org/gen-viz-workshop/intro_to_deseq2/tutorial/E-GEOD-50760-experiment-design.tsv")
# Format colData as required by DESeq2
# 1. Set sample ID (here Run column) as rowname
rownames(colData) <- colData$Run
# 2. Remove unnecessary columns
# Tissue type is the primary condition of interest
# Individual IDs are treated as control variables to account for inter-individual variation
keep <- c("Sample.Characteristic.biopsy.site.", "Sample.Characteristic.individual.")
colData <- colData[, keep]
# 3. Rename columns
colnames(colData) <- c("tissue_type", "individual_id")
# 4. Verify that the sample IDs match exactly and are in the same order between rawCounts and colData
all(rownames(colData) == colnames(rawCounts)) # should return TRUE
[1] TRUE
head(colData)
# 5. convert both columns of colData as factors
class(colData$individual_id) # "character"
[1] "character"
colData$individual_id <- factor(colData$individual_id)
class(colData$individual_id) # "factor"
[1] "factor"
colData$tissue_type <- factor(colData$tissue_type)
# DESeq2 uses the first factor level as reference
levels(colData$tissue_type) # reference = "colorectal cancer metastatic in the liver"
[1] "colorectal cancer metastatic in the liver"
[2] "normal"
[3] "primary tumor"
# Comparisons: "normal" and "primary tumor" vs reference
# We want to compare "colorectal cancer metastatic in the liver" and "primary tumor" against "normal" instead
# 6. Change levels order such that reference = "normal"
levels(colData$tissue_type) <- c(
"normal",
"primary tumor",
"colorectal cancer metastatic in the liver"
)
levels(colData$tissue_type)
[1] "normal"
[2] "primary tumor"
[3] "colorectal cancer metastatic in the liver"
# Create the DEseq2DataSet object
# Design: control variable first, condition of interest second
# DESeq2 adjusts for individual variation before testing tissue_type differences
dds <- DESeqDataSetFromMatrix(countData = rawCounts, colData = colData, design = ~ individual_id + tissue_type)
dds # prints summary of the DESeqDataSet
class: DESeqDataSet
dim: 44316 54
metadata(1): version
assays(1): counts
rownames(44316): ENSG00000000003 ENSG00000000005 ... ENSG00000281918
ENSG00000281920
rowData names(0):
colnames(54): SRR975551 SRR975552 ... SRR975603 SRR975604
colData names(2): tissue_type individual_id
colData(dds) # shows sample annotations (e.g., tissue_type, individual_id)
DataFrame with 54 rows and 2 columns
tissue_type individual_id
<factor> <factor>
SRR975551 colorectal cancer metastatic in the liver AMC_2
SRR975552 colorectal cancer metastatic in the liver AMC_3
SRR975553 colorectal cancer metastatic in the liver AMC_5
SRR975554 colorectal cancer metastatic in the liver AMC_6
SRR975555 colorectal cancer metastatic in the liver AMC_7
... ... ...
SRR975600 normal AMC_20
SRR975601 normal AMC_21
SRR975602 normal AMC_22
SRR975603 normal AMC_23
SRR975604 normal AMC_24
counts(dds)[1:5, 1:5] # view first few genes and samples
SRR975551 SRR975552 SRR975553 SRR975554 SRR975555
ENSG00000000003 6617 1352 1492 3390 1464
ENSG00000000005 69 1 20 23 12
ENSG00000000419 2798 714 510 1140 1667
ENSG00000000457 486 629 398 239 383
ENSG00000000460 466 342 73 227 193
# Run the DESeq pipeline
dds <- DESeq(dds)
# Variance Stabilizing Transformation and plot PCA
vsd <- vst(dds, blind = FALSE)
plotPCA(vsd, intgroup="tissue_type")+ theme_grey()

# Note: contrast = c(factor, numerator, denominator)
# means Compute log2 fold change as numerator / denominator.
# contrast = c("tissue_type", "primary tumor", "normal"): positive values mean higher in tumor, negative mean higher in normal
# contrast = c("tissue_type", "normal", "primary tumor"): positive values mean higher in normal, negative mean higher in tumor
# Extract results
res1 <- results(dds, contrast = c("tissue_type", "primary tumor", "normal"))
res1
log2 fold change (MLE): tissue_type primary tumor vs normal
Wald test p-value: tissue type primary.tumor vs normal
DataFrame with 44316 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSG00000000003 1824.881 -0.441529 0.180629 -2.44439 1.45095e-02 3.48599e-02
ENSG00000000005 10.852 0.540095 0.435418 1.24040 2.14826e-01 3.30104e-01
ENSG00000000419 725.173 -1.009012 0.135824 -7.42881 1.09583e-13 3.96955e-12
ENSG00000000457 311.358 0.321168 0.113114 2.83934 4.52069e-03 1.26684e-02
ENSG00000000460 126.362 -1.357771 0.211021 -6.43429 1.24048e-10 2.26492e-09
... ... ... ... ... ... ...
ENSG00000281909 0.2767389 0.3742786 2.936548 0.1274553 0.898580 NA
ENSG00000281910 0.0537801 0.0416978 3.668177 0.0113674 0.990930 NA
ENSG00000281912 10.2974190 0.1583833 0.227051 0.6975657 0.485449 0.614118
ENSG00000281918 0.2310112 0.2723037 3.664070 0.0743173 0.940758 NA
ENSG00000281920 0.2112458 0.0399116 3.666410 0.0108857 0.991315 NA
# Remove rows with NA
res1 <- na.omit(res1)
res1.df <- as.data.frame(res1)
res1.df
# get gene names for gene IDs
res1.df$symbol <- mapIds(org.Hs.eg.db, keys = rownames(res1.df), keytype = "ENSEMBL", column = "SYMBOL")
res1.df <- res1.df[!is.na(res1.df$symbol),]
res2 <- results(dds, contrast = c("tissue_type", "colorectal cancer metastatic in the liver", "normal"))
res2 <- na.omit(res2)
res2.df <- as.data.frame(res2)
res2.df
res2.df$symbol <- mapIds(org.Hs.eg.db, keys = rownames(res2.df), keytype = "ENSEMBL", column = "SYMBOL")
res2.df <- res2.df[!is.na(res2.df$symbol),]
# Volcano plots 1
EnhancedVolcano(res1.df, x = 'log2FoldChange', y = 'padj', lab = res1.df$symbol,
pCutoff = 0.05, FCcutoff = 1)

# Volcano plots 2
EnhancedVolcano(res2.df, x = 'log2FoldChange', y = 'padj', lab = res2.df$symbol,
pCutoff = 0.05, FCcutoff = 1)

# Volcano plots using ggplot2
res1.df$expression <- "NOT"
res1.df$expression[res1.df$padj < 0.01 & res1.df$log2FoldChange > 2] <- "UP"
res1.df$expression[res1.df$padj < 0.01 & res1.df$log2FoldChange < -2] <- "DOWN"
ggplot(data = res1.df, aes(x = log2FoldChange, y = -log10(padj), col = expression)) +
geom_vline(xintercept = c(-2, 2), col = "gray", linetype = 'dashed') +
geom_hline(yintercept = -log10(0.01), col = "gray", linetype = 'dashed') +
geom_point(size = 2) +
scale_color_manual(values = c("#3498DB", "grey", "#E74C3C"),
labels = c("Downregulated", "Not significant", "Upregulated")) +
theme_classic()

res2.df$expression <- "NOT"
res2.df$expression[res2.df$padj < 0.01 & res2.df$log2FoldChange > 2] <- "UP"
res2.df$expression[res2.df$padj < 0.01 & res2.df$log2FoldChange < -2] <- "DOWN"
ggplot(data = res2.df, aes(x = log2FoldChange, y = -log10(padj), col = expression)) +
geom_vline(xintercept = c(-2, 2), col = "gray", linetype = 'dashed') +
geom_hline(yintercept = -log10(0.01), col = "gray", linetype = 'dashed') +
geom_point(size = 2) +
scale_color_manual(values = c("#3498DB", "grey", "#E74C3C"),
labels = c("Downregulated", "Not significant", "Upregulated")) +
theme_classic()

# Find significant genes
sigs1.df <- res1.df[res1.df$padj < 0.01,]
sigs1.df <- sigs1.df[(abs(sigs1.df$log2FoldChange) > 2),]
sigs1.df <- sigs1.df[sigs1.df$baseMean > 50,] # - average normalized count for a gene across all samples
sigs1.df
NA
NA
sigs2.df <- res2.df[res2.df$padj < 0.01,]
sigs2.df <- sigs2.df[(abs(sigs2.df$log2FoldChange) > 2),]
sigs2.df <- sigs2.df[sigs2.df$baseMean > 50,]
sigs2.df
# Combine significant genes from both comparisons
sigs_combined <- rbind(sigs1.df, sigs2.df)
# Get unique gene IDs
unique_genes <- unique(c(rownames(sigs1.df), rownames(sigs2.df)))
# Subset by unique rownames
sigs_combined <- sigs_combined[unique_genes, ]
# Extract normalized counts for those genes
mat <- counts(dds, normalized=T)[rownames(sigs_combined),] # or mat <- assay(vsd)[rownames(sigs_combined), ]
# Z-score transform each gene across samples
mat.z <- t(apply(mat, 1, scale))
colnames(mat.z) <- rownames(colData)
tissue <- colData$tissue_type
# Color map
tissue_color <- c(
"primary tumor" = "darkorange",
"normal" = "forestgreen",
"colorectal cancer metastatic in the liver" = "firebrick"
)
ha <- HeatmapAnnotation(
Tissue = tissue,
col = list(Tissue = tissue_color)
)
# Plot Heatmap
Heatmap(
mat.z,
name = "Z-score",
top_annotation = ha,
cluster_rows = TRUE,
cluster_columns = TRUE,
show_column_names = TRUE,
show_row_names = FALSE,
row_labels = sigs_combined[rownames(mat.z), "symbol"],# useful when above is TRUE
row_names_gp = gpar(fontsize = 6),
column_names_gp = gpar(fontsize = 6),
heatmap_legend_param = list(title = "Z-score", legend_direction = "horizontal")
)

# Get top 50 genes from both comparisons
topGenes1 <- head(sigs1.df[order(abs(sigs1.df$log2FoldChange), decreasing = TRUE), ], 50)
topGenes2 <- head(sigs2.df[order(abs(sigs2.df$log2FoldChange), decreasing = TRUE), ], 50)
# Combine both sets
topGenes_combined <- rbind(topGenes1, topGenes2)
# Get unique ENSEMBL IDs
unique_genes <- unique(c(rownames(topGenes1), rownames(topGenes2)))
# Subset by unique rownames
topGenes_combined <- topGenes_combined[unique_genes, ]
# Extract normalized counts for those genes
topmat <- counts(dds, normalized=T)[rownames(topGenes_combined),]
topmat.z <- t(apply(topmat, 1, scale)) # Z score transformation of genes across samples.
colnames(topmat.z) <- rownames(colData)
Heatmap(
topmat.z,
name = "Z-score",
top_annotation = ha,
cluster_rows = TRUE,
cluster_columns = TRUE,
show_column_names = TRUE,
show_row_names = TRUE,
row_labels = topGenes_combined[rownames(topmat.z), "symbol"],
row_names_gp = gpar(fontsize = 6),
column_names_gp = gpar(fontsize = 6),
heatmap_legend_param = list(title = "Z-score", legend_direction = "horizontal")
)

---
title: "R Notebook"
output: html_notebook
---

```{r}
# Load packages
library(DESeq2) # ‘1.46.0’
library(dplyr)
library(ggplot2)
library(org.Hs.eg.db) # library(org.Mm.eg.db) for mouse
library(ComplexHeatmap)
library(circlize)   # for color scales
library(grid)       # for gpar()
library(EnhancedVolcano)
```

```{r}
# Load count_matrix
args(read.delim) # header = TRUE by default
rawCounts <- read.delim("http://genomedata.org/gen-viz-workshop/intro_to_deseq2/tutorial/E-GEOD-50760-raw-counts.tsv")
```

```{r}
# Format rawCounts as required by DESeq2
```

```{r}
# 1. Set gene ID column as rowname
rownames(rawCounts) <- rawCounts$Gene.ID

# DESeq2 expects the rawCounts to contain only raw, unnormalized integer values, with no non-numeric columns.
# 2. Remove columns with non numerical values 
rawCounts_bkup <- rawCounts
rawCounts <- rawCounts[, -c(1, 2)]

class(rawCounts) # a data.frame
# 3. Convert rawCounts as a matrix
rawCounts <- as.matrix(rawCounts)
```
```{r}
# Note: we can do both steps above in one line of code
# rawCounts <- as.matrix(rawCounts[, sapply(rawCounts, is.numeric)])
```

```{r}
# checks
is.matrix(rawCounts) # should be TRUE
is.numeric(rawCounts) # should be TRUE
```

```{r}
# # Filtering: remove genes with 0 reads in all samples.
dim(rawCounts)
dim(rawCounts[which(rowSums(rawCounts)>0),])
rawCounts <- rawCounts[which(rowSums(rawCounts)>0),]
```
```{r}
# Load colData (a dataframe describing samples)
colData <- read.delim("http://genomedata.org/gen-viz-workshop/intro_to_deseq2/tutorial/E-GEOD-50760-experiment-design.tsv")
```

```{r}
# Format colData as required by DESeq2
```

```{r}
# 1. Set sample ID (here Run column) as rowname
rownames(colData) <- colData$Run

# 2. Remove unnecessary columns
# Tissue type is the primary condition of interest
# Individual IDs are treated as control variables to account for inter-individual variation
keep <- c("Sample.Characteristic.biopsy.site.", "Sample.Characteristic.individual.")
colData <- colData[, keep]

# 3. Rename columns
colnames(colData) <- c("tissue_type", "individual_id")

#  4. Verify that the sample IDs match exactly and are in the same order between rawCounts and colData
all(rownames(colData) == colnames(rawCounts)) # should return TRUE

head(colData)
```
```{r}
# 5. convert both columns of colData as factors
class(colData$individual_id) # "character"
colData$individual_id <- factor(colData$individual_id)
class(colData$individual_id) # "factor"
colData$tissue_type <- factor(colData$tissue_type)
```
```{r}
# DESeq2 uses the first factor level as reference
levels(colData$tissue_type)  # reference = "colorectal cancer metastatic in the liver"
# Comparisons: "normal" and "primary tumor" vs reference
```
```{r}
# We want to compare "colorectal cancer metastatic in the liver" and  "primary tumor" against "normal" instead
# 6. Change levels order such that  reference = "normal"
levels(colData$tissue_type) <- c(
  "normal",
  "primary tumor",
  "colorectal cancer metastatic in the liver"
)
levels(colData$tissue_type)
```
```{r}
# Create the DEseq2DataSet object
# Design: control variable first, condition of interest second
# DESeq2 adjusts for individual variation before testing tissue_type differences
dds <- DESeqDataSetFromMatrix(countData = rawCounts, colData = colData, design = ~ individual_id + tissue_type)

```

```{r}
dds # prints summary of the DESeqDataSet

```
```{r}
colData(dds)  # shows sample annotations (e.g., tissue_type, individual_id)

```
```{r}
counts(dds)[1:5, 1:5]  # view first few genes and samples
```
```{r}

#  Run the DESeq pipeline
dds <- DESeq(dds)
```

```{r}
# Variance Stabilizing Transformation and  plot PCA
vsd <- vst(dds, blind = FALSE)
plotPCA(vsd, intgroup="tissue_type")+ theme_grey()
```
```{r}
# Note: contrast = c(factor, numerator, denominator)
# means Compute log2 fold change as numerator / denominator.
# contrast = c("tissue_type", "primary tumor", "normal"): positive values mean higher in tumor, negative mean higher in normal
# contrast = c("tissue_type", "normal", "primary tumor"): positive values mean higher in normal, negative mean higher in tumor
```

```{r}
# Extract results
res1 <- results(dds, contrast = c("tissue_type", "primary tumor", "normal"))
res1
```
```{r}
# Remove rows with NA 
res1 <- na.omit(res1)
res1.df <- as.data.frame(res1)
res1.df
```
```{r}
# get gene names for gene IDs
res1.df$symbol <- mapIds(org.Hs.eg.db, keys = rownames(res1.df), keytype = "ENSEMBL", column = "SYMBOL")
res1.df <- res1.df[!is.na(res1.df$symbol),]

```

```{r}
res2 <- results(dds, contrast = c("tissue_type", "colorectal cancer metastatic in the liver", "normal"))
res2 <- na.omit(res2)
res2.df <- as.data.frame(res2)
res2.df
res2.df$symbol <- mapIds(org.Hs.eg.db, keys = rownames(res2.df), keytype = "ENSEMBL", column = "SYMBOL")
res2.df <- res2.df[!is.na(res2.df$symbol),]
```

```{r}
# Volcano plots 1
EnhancedVolcano(res1.df, x = 'log2FoldChange', y = 'padj', lab = res1.df$symbol,
                pCutoff = 0.05, FCcutoff = 1)

```
```{r}
# Volcano plots 2
EnhancedVolcano(res2.df, x = 'log2FoldChange', y = 'padj', lab = res2.df$symbol,
                pCutoff = 0.05, FCcutoff = 1)
```
```{r}
# Volcano plots using ggplot2

res1.df$expression <- "NOT"
res1.df$expression[res1.df$padj < 0.01 & res1.df$log2FoldChange > 2] <- "UP"
res1.df$expression[res1.df$padj < 0.01 & res1.df$log2FoldChange < -2] <- "DOWN"

ggplot(data = res1.df, aes(x = log2FoldChange, y = -log10(padj), col = expression)) +
  geom_vline(xintercept = c(-2, 2), col = "gray", linetype = 'dashed') +
  geom_hline(yintercept = -log10(0.01), col = "gray", linetype = 'dashed') +
  geom_point(size = 2) +
  scale_color_manual(values = c("#3498DB", "grey", "#E74C3C"), 
                     labels = c("Downregulated", "Not significant", "Upregulated")) +
  theme_classic()
```
```{r}
res2.df$expression <- "NOT"
res2.df$expression[res2.df$padj < 0.01 & res2.df$log2FoldChange > 2] <- "UP"
res2.df$expression[res2.df$padj < 0.01 & res2.df$log2FoldChange < -2] <- "DOWN"

ggplot(data = res2.df, aes(x = log2FoldChange, y = -log10(padj), col = expression)) +
  geom_vline(xintercept = c(-2, 2), col = "gray", linetype = 'dashed') +
  geom_hline(yintercept = -log10(0.01), col = "gray", linetype = 'dashed') +
  geom_point(size = 2) +
  scale_color_manual(values = c("#3498DB", "grey", "#E74C3C"), 
                     labels = c("Downregulated", "Not significant", "Upregulated")) +
  theme_classic()
```

```{r}
# Find significant genes
sigs1.df <- res1.df[res1.df$padj < 0.01,]
sigs1.df <- sigs1.df[(abs(sigs1.df$log2FoldChange) > 2),]
sigs1.df <- sigs1.df[sigs1.df$baseMean > 50,] # -  average normalized count for a gene across all samples
sigs1.df


```

```{r}
sigs2.df <- res2.df[res2.df$padj < 0.01,]
sigs2.df <- sigs2.df[(abs(sigs2.df$log2FoldChange) > 2),]
sigs2.df <- sigs2.df[sigs2.df$baseMean > 50,] 
sigs2.df
```

```{r}
# Combine significant genes from both comparisons
sigs_combined <- rbind(sigs1.df, sigs2.df)
# Get unique gene IDs
unique_genes <- unique(c(rownames(sigs1.df), rownames(sigs2.df)))
# Subset by unique rownames
sigs_combined <- sigs_combined[unique_genes, ]
```

```{r}
# Extract normalized counts for those genes
mat <- counts(dds, normalized=T)[rownames(sigs_combined),] # or mat <- assay(vsd)[rownames(sigs_combined), ]
# Z-score transform each gene across samples
mat.z <- t(apply(mat, 1, scale))
colnames(mat.z) <- rownames(colData) 
```

```{r}
tissue <- colData$tissue_type

# Color map
tissue_color <- c(
  "primary tumor" = "darkorange",
  "normal" = "forestgreen",
  "colorectal cancer metastatic in the liver" = "firebrick"
)

ha <- HeatmapAnnotation(
  Tissue = tissue,
  col = list(Tissue = tissue_color)
)
```

```{r}
# Plot Heatmap
Heatmap(
  mat.z,
  name = "Z-score",
  top_annotation = ha,
  cluster_rows = TRUE,
  cluster_columns = TRUE,
  show_column_names = TRUE,
  show_row_names = FALSE,
  row_labels = sigs_combined[rownames(mat.z), "symbol"],# useful when above is TRUE
  row_names_gp = gpar(fontsize = 6),
  column_names_gp = gpar(fontsize = 6),
  heatmap_legend_param = list(title = "Z-score", legend_direction = "horizontal")
)
```
```{r}
# Get top 50 genes from both comparisons
topGenes1 <- head(sigs1.df[order(abs(sigs1.df$log2FoldChange), decreasing = TRUE), ], 50)
topGenes2 <- head(sigs2.df[order(abs(sigs2.df$log2FoldChange), decreasing = TRUE), ], 50)
```

```{r}
# Combine both sets
topGenes_combined <- rbind(topGenes1, topGenes2)
# Get unique ENSEMBL IDs
unique_genes <- unique(c(rownames(topGenes1), rownames(topGenes2)))
# Subset by unique rownames
topGenes_combined <- topGenes_combined[unique_genes, ]
```

```{r}
# Extract normalized counts for those genes
topmat <- counts(dds, normalized=T)[rownames(topGenes_combined),] 
topmat.z <- t(apply(topmat, 1, scale)) # Z score transformation of genes across samples.
colnames(topmat.z) <- rownames(colData)
```

```{r}
Heatmap(
  topmat.z,
  name = "Z-score",
  top_annotation = ha,
  cluster_rows = TRUE,
  cluster_columns = TRUE,
  show_column_names = TRUE,
  show_row_names = TRUE,
  row_labels = topGenes_combined[rownames(topmat.z), "symbol"],
  row_names_gp = gpar(fontsize = 6),
  column_names_gp = gpar(fontsize = 6),
  heatmap_legend_param = list(title = "Z-score", legend_direction = "horizontal")
)
```

